from BasicLibraries import *
from Functions.Miscellaneous import utils as ut
import self_selectivity as ssv
import make_strategy as ms
import jinja2




# Window length (number of consecutive rows per firm) passed to every
# ``.rolling(...).mean()`` call in this module to build the ``*_rw`` columns.
rolling_window_size = 3

def get_competition(X):
    """Add a sector-level competition proxy to a firm-year panel.

    For every (fiscal year, GICS level-3 sector) pair the total ``sale_usd``
    is computed, and each firm-year receives
    ``competition = 1 - sale_usd / sector_sale``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``sale_usd``, ``fyear`` and ``GICS_level_3``.  The sector
        column is cleaned and a ``mrg_C`` key column is added in place before
        the merge.

    Returns
    -------
    pandas.DataFrame
        ``X`` inner-merged with the sector totals, carrying the extra columns
        ``mrg_C``, ``sector_sale`` and ``competition``.
    """
    sector_level = 'GICS_level_3'
    # Map the misspelled label onto the correct one.  The previous call passed
    # both labels as a single ``to_replace`` list with no ``value``, which does
    # not rename anything (it forward-fills or raises, depending on the pandas
    # version).
    X[sector_level] = X[sector_level].replace('Construction & Negineering', 'Construction & Engineering')
    # Total sales per year and sector.
    z = X[['sale_usd', 'fyear', sector_level]].groupby(['fyear', sector_level]).sum().reset_index()
    z = z.rename(columns = {'sale_usd': 'sector_sale'})
    # "<year>-<sector>" key shared by both frames.
    z['mrg_C'] = z['fyear'].astype(int).astype(str)+'-'+z[sector_level]
    X['mrg_C'] = X['fyear'].astype(int).astype(str)+'-'+X[sector_level]
    X = X.merge(z[['mrg_C', 'sector_sale']])
    X['competition'] = 1. - X['sale_usd']/X['sector_sale']
    return X

def get_control_factors():
    """Load the raw input files and build the control-variable tables.

    Reads the Compustat, Eikon, TruCost and earnings-at-risk CSV files from
    ``local_data/``, derives firm-level ratios, appends rolling means
    (``*_rw`` columns, window ``rolling_window_size``), fits the
    self-selection model and prints its coefficient table as LaTeX.

    Returns
    -------
    tuple
        ``(comp, db, IMR, esg, tc, EBITDA_AR, EBIT_AR)`` where

        * ``comp``      - Compustat panel with ratios, competition and ``*_rw`` columns,
        * ``db``        - Eikon panel with ``Leverage``/``MTB`` and their ``*_rw`` versions,
        * ``IMR``       - ``mrg`` and ``Mills_Ratio`` columns from the self-selection step,
        * ``esg``       - placeholder, always ``0``,
        * ``tc``        - TruCost emissions with ``DirectControl_rw``,
        * ``EBITDA_AR`` / ``EBIT_AR`` - the earnings-at-risk files as read.
    """

    #== Compustat fundamentals
    comp = pd.read_csv('local_data/Compustat_Data_0624.csv', low_memory=False)
    # Merge key built by the project helper (presumably "<gvkey>-<fyear>" -- confirm in utils.make_mrg)
    comp['mrg'] = ut.utils().make_mrg(comp, 'fyear')
    # Snapshot taken before any derived column is added; this is what the self-selection model receives
    cmp_for_ss = comp.copy()
    comp['Tangibility'] = comp['ppent_usd']/comp['at_usd']
    # Sort so that the per-firm shift below yields the previous row's total assets
    comp = comp.sort_values(by = ['gvkey', 'fyear'])
    comp['at_lagged'] = comp[['at_usd', 'gvkey']].groupby('gvkey').shift(1)
    comp['Profitability'] = comp['ebitda_usd']/comp['at_lagged']
    # Missing minority interest (mib_usd) counts as zero; missing debt or equity still propagates NaN
    comp['investment'] = comp['dltt_usd'] + comp['ceq_usd'] + comp['mib_usd'].replace([np.nan], [0])
    comp['inv_int'] = comp['capx_usd']/comp['sale_usd']
    comp = get_competition(comp)
    # NOTE(review): 'firm_size' is not computed here, so it must already be a column of the CSV
    vars_ = ['competition', 'Profitability', 'Tangibility', 'firm_size', 'inv_int', 'at_usd', 'at_lagged', 'capx_usd']
    # Per-firm rolling means; every column is suffixed with '_rw' and the key renamed back to 'mrg' for the merge
    c_rol = comp[vars_+ ['fyear', 'gvkey']].set_index('fyear').groupby('gvkey').rolling(rolling_window_size).mean().reset_index()
    c_rol['mrg'] = ut.utils().make_mrg(c_rol, 'fyear')
    c_rol.columns = c_rol.keys()+'_rw'
    comp = comp.merge(c_rol.drop(columns = ['gvkey_rw', 'fyear_rw']).rename(columns = {'mrg_rw': 'mrg'}))

    #== Eikon fundamentals
    db = pd.read_csv('local_data/Eikon_fundamentals_0624.csv')
    # Both ratios share the term (total assets - common equity + market cap)
    db['Leverage'] = (db.debt_total)/(db['at'] - db['ceq'] + db['mkt_cap'])
    db['MTB'] = (db['at'] - db['ceq'] + db['mkt_cap'])/db['at']
    # Drops negative and missing leverage (NaN >= 0 is False)
    db = db[(db.Leverage >=0) ]
    # Drops rows at or above the 99th percentile of MTB, and rows with missing MTB
    db = db[(db.MTB < db.MTB.quantile(0.99)) ]
    db['mrg'] = ut.utils().make_mrg(db, 'fyear')
    vars_ = ['Leverage', 'MTB']
    # Same rolling-mean / '_rw' suffix pattern as for Compustat above
    r_rol = db[vars_+ ['fyear', 'gvkey']].set_index('fyear').groupby('gvkey').rolling(rolling_window_size).mean().reset_index()
    r_rol['mrg'] = ut.utils().make_mrg(r_rol, 'fyear')
    r_rol.columns = r_rol.keys()+'_rw'
    db = db.merge(r_rol.drop(columns = ['gvkey_rw', 'fyear_rw']).rename(columns = {'mrg_rw': 'mrg'}))
    
    #== Mills ratio
    # ssv.SelfSelection returns the per-observation table and a model summary
    # (presumably a Heckman-style first stage -- confirm in self_selectivity)
    IMR, hc_model = ssv.SelfSelection(cmp_for_ss)
    IMR = IMR[['mrg', 'Mills_Ratio']]
    hc  = hc_model.transpose()
    # NOTE(review): the key 'firm_size:' ends with a colon -- confirm it matches the model's column name,
    # otherwise that rename is silently skipped
    hc = hc.rename(columns = {'disclosure_probGICS_level_1': 'Prob.Sect.Disclosure',
                              'disclosure_probmacro_regions': 'Prob.Reg.Disclosure',
                              'firm_size:': 'Size',
                              'investment': 'Invested capital'})
    # Blank row label for the printed table; this assignment requires hc to have exactly one row
    hc.index = ['']
    print(hc.to_latex(escape=False))
    
    #=== ESG ratings (placeholder: no ESG data is loaded)
    esg=0

    #=== Emissions
    tc = pd.read_csv('local_data/TruCost_DFT_0423.csv')
    tc = tc.sort_values(by = ['gvkey', 'cyear'])
    # 'mrg' is taken from the file itself here, not rebuilt with make_mrg
    tc = tc[['gvkey', 'cyear', 'DirectControl', 'mrg', 'GICS_level_1']]
    tc = tc.sort_values(by = ['gvkey', 'cyear'])
    # Rolling mean of DirectControl per firm, keyed on calendar year
    t_rol = tc[['gvkey', 'cyear', 'DirectControl']].set_index('cyear').groupby('gvkey').rolling(rolling_window_size).mean().reset_index()
    t_rol['mrg'] = ut.utils().make_mrg(t_rol, 'cyear')
    t_rol.columns = t_rol.keys()+'_rw'
    tc = tc.merge(t_rol.drop(columns = ['gvkey_rw', 'cyear_rw']).rename(columns = {'mrg_rw': 'mrg'}))

    #=== Earnings at risk (returned exactly as read)
    EBITDA_AR = pd.read_csv('local_data/Earnings_at_risk_0623.csv')
    EBIT_AR = pd.read_csv('local_data/EBIT_at_risk_0623.csv')
    
    
    return comp, db, IMR, esg, tc, EBITDA_AR,  EBIT_AR
def remove_duplicated_fingerprints(x):
    """Drop rows whose ``md5_fingerprint`` occurs more than once.

    For each duplicated fingerprint exactly one observation is kept:

    * if all copies share one ``data_type``, the last copy is kept;
    * otherwise every ``'crawled'`` copy is discarded and the last of the
      remaining (non-crawled) copies is kept.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain ``md5_fingerprint``, ``mrg`` and ``data_type``.

    Returns
    -------
    pandas.DataFrame
        ``x`` without the rows whose ``mrg`` was marked for removal.

    Notes
    -----
    Removal is keyed on ``mrg``, so every row sharing a removed ``mrg`` value
    is dropped, whatever its fingerprint.
    """
    #=== Find duplicated fingerprints
    counts = x[['md5_fingerprint', 'mrg']].groupby('md5_fingerprint').count()
    duplicated = counts[counts.mrg > 1].index

    #=== Decide which observations to drop, giving priority to downloaded files
    to_remove = []
    for fingerprint in duplicated:
        copies = x[x.md5_fingerprint == fingerprint]
        candidates = copies
        if len(copies.data_type.unique()) > 1:
            is_crawled = copies.data_type == 'crawled'
            to_remove.extend(copies.loc[is_crawled, 'mrg'].tolist())
            # Only the non-crawled copies compete for the kept slot.  The old
            # code sliced the full group here, which discarded every copy
            # whenever the last row of the group was a crawled one.
            candidates = copies[~is_crawled]
        # Keep the last candidate, drop the others (no-op for 0 or 1 rows).
        to_remove.extend(candidates['mrg'].iloc[:-1].tolist())

    #=== Remove duplicates
    return x[~x.mrg.isin(to_remove)]
def get_behaviour(initiatives_to_remove, missing_sdgs):
    """Load and filter the behavioural (initiatives) dataset.

    Steps, in order: keep reporting years before 2022; drop the SDG/TYPE
    recipient and stakeholder-recipient column families; drop the columns in
    ``initiatives_to_remove`` when ``missing_sdgs`` is non-empty; keep firms
    that either have a row from 2020 onwards or more than 3 rows before 2020;
    keep four GICS level-1 sectors; finally de-duplicate on fingerprint.

    Parameters
    ----------
    initiatives_to_remove : list
        Column names dropped when ``missing_sdgs`` is non-empty.
    missing_sdgs : sequence
        Only its length is inspected.

    Returns
    -------
    pandas.DataFrame
        The filtered, de-duplicated dataset.
    """
    behaviour = pd.read_csv('local_data/LC_dataset_v_1_1L.csv')
    behaviour = behaviour[behaviour.rfyear < 2022]

    # Strip the column families one pattern at a time.
    for pattern in ('SDG_SREC:', 'TYPE_SREC:', 'SDG:', 'TYPE:', 'stakeholder_recipient_'):
        unwanted = list(behaviour.filter(regex=pattern))
        behaviour = behaviour[behaviour.columns.drop(unwanted)]

    if len(missing_sdgs) > 0:
        behaviour = behaviour.drop(columns=initiatives_to_remove)

    # Firms observed in 2020 or later ...
    recent_firms = list(behaviour[behaviour.rfyear >= 2020].gvkey.unique())
    # ... or with more than three rows before 2020.
    early_rows = behaviour[behaviour.rfyear < 2020][['gvkey', 'rfyear']]
    early_counts = early_rows.groupby('gvkey').count()
    established_firms = list(early_counts[early_counts > 3].dropna().index)
    retained = np.unique(recent_firms + established_firms)
    behaviour = behaviour[behaviour.gvkey.isin(retained)]

    sectors = ['Industrial', 'Material', 'Energy', 'Utilities']
    behaviour = behaviour[behaviour.GICS_level_1.isin(sectors)]
    return remove_duplicated_fingerprints(behaviour)


def get_entropy(X, I):
    """Return the entropy of the shares ``X[name + '_%']`` for ``name`` in ``I``.

    Each non-zero share ``p`` contributes ``p * log(1 / p)``; zero shares
    contribute nothing.  With no non-zero share the result is the integer 0.
    """
    shares = (X[label + '_%'] for label in I)
    return sum(p * np.log(1 / p) for p in shares if p != 0)

def get_segments(X, I):
    """Return ``n / log(n)`` where ``n`` counts the non-zero shares in ``X``.

    A share is ``X[name + '_%']`` for ``name`` in ``I``.  With fewer than two
    non-zero shares the ratio is undefined (``log(1) == 0`` and ``log(0)`` is
    ``-inf``), so 0 is returned, mirroring the guard in
    ``get_entropy_segments``.
    """
    segments = sum(1 for i in I if X[i + '_%'] != 0)
    if segments <= 1:
        return 0
    return segments/np.log(segments)
def get_entropy_segments(X, I):
    """Return the entropy of the shares scaled by ``n / log(n)``.

    ``n`` is the number of non-zero shares ``X[name + '_%']`` for ``name`` in
    ``I``.  With at most one non-zero share the result is 0.
    """
    entropy = 0
    active = 0
    for share in (X[label + '_%'] for label in I):
        if share == 0:
            continue
        entropy += share * np.log(1 / share)
        active += 1
    return entropy*active/np.log(active) if active > 1 else 0

def get_simpson(X, I):
    """Return one minus the sum of squared shares ``X[name + '_%']``.

    When the sum of squares is exactly zero (all shares zero, or ``I`` empty)
    ``numpy.nan`` is returned instead.
    """
    concentration = sum(X[label + '_%'] ** 2 for label in I)
    return np.nan if concentration == 0 else 1. - concentration
def merge_with_behavioural_data(initiatives, initiatives_sdgs, missing_sdgs, sdgs, initiatives_to_remove,
                                comp, db, IMR, esg, tc, EAR_,
                                diversification_measure = 'entropy',
                                impact_type = 'emissions', scenario_type = 'low',
                                scenario_horizon = 2030,
                                sector_normalisation = True,
                                years_FE=True):
    """Assemble the regression panel from behavioural data and control tables.

    Parameters
    ----------
    initiatives : list of str
        Initiative names; every column containing a name is summed into a
        column of that name.
    initiatives_sdgs : list of str
        Initiative/SDG column names used for shares, total effort and the
        diversification measure.
    missing_sdgs, initiatives_to_remove
        Forwarded to ``get_behaviour``.
    sdgs : list
        Forwarded to ``make_strategy_diversification``.
    comp, db, IMR, tc : pandas.DataFrame
        Control tables as returned by ``get_control_factors``.
    esg
        Unused.
    EAR_ : pandas.DataFrame
        Earnings-at-risk table; only read when ``impact_type`` is ``'EAR'`` or
        ``'EMR'``.
    diversification_measure : {'entropy', 'entropy_segments', 'simpson'}
        Which row-wise measure fills the ``diversification`` column.
    impact_type : str
        ``'EAR'`` / ``'EMR'`` merge the matching column of ``EAR_`` filtered by
        ``scenario_type`` and ``scenario_horizon``; any other value adds
        nothing.
    sector_normalisation : bool
        Divide ``diversification`` by its year / GICS-level-3 / region mean.
    years_FE : bool
        Include the year dummies in the returned dummy list.

    Returns
    -------
    (pandas.DataFrame, list)
        The merged panel (one row per ``mrg``) and the list of dummy column
        names.
    """
    #== Behavioural data, restricted to observations with more than 5 initiatives
    b = get_behaviour(initiatives_to_remove, missing_sdgs)
    b = b[b.number_of_initiatives > 5]

    #== Depreciation
    # NOTE(review): `dep` is built but never merged or returned below.
    dep = pd.read_csv('local_data/Depreciation_data.csv')
    dep['d'] = dep['Depreciation_total']/dep['at']
    dep['mrg'] = ut.utils().make_mrg(dep, 'fyear')

    #== Emissions: per-firm lags and leads of DirectControl
    tc = tc.sort_values(by = ['gvkey', 'cyear'])
    tc['DirectControl_lag'] = tc[['gvkey', 'DirectControl']].groupby('gvkey').shift(1)
    tc['DirectControl_lag_rw'] = tc[['gvkey', 'DirectControl_rw']].groupby('gvkey').shift(1)
    tc['DirectControl_fut_1'] = tc[['gvkey', 'DirectControl']].groupby('gvkey').shift(-1)
    tc['DirectControl_fut_2'] = tc[['gvkey', 'DirectControl']].groupby('gvkey').shift(-2)
    # Sum of the two leads; NaN if either is missing, and an exact 0 is also mapped to NaN
    tc['DirectControl_fut_tot'] = tc[['DirectControl_fut_1', 'DirectControl_fut_2']].sum(axis  = 1, skipna=False).replace([0], [np.nan])

    #== Merge them (all inner merges)
    cm_var = ['competition', 'inv_int', 'at_usd', 'at_lagged', 'Tangibility', 'Profitability', 'firm_size', 'capx_usd']
    cm_rol = list(np.char.add(cm_var, '_rw'))
    dt = b.merge(comp[['mrg', 'investment', 'sale_usd']+cm_var+cm_rol]).\
                 merge(db[['Leverage', 'MTB', 'Leverage_rw', 'MTB_rw', 'mrg']], on = 'mrg').\
                     merge(IMR).\
                         merge(tc[['DirectControl', 'DirectControl_fut_1', 'DirectControl_fut_2',  'DirectControl_fut_tot', 'DirectControl_lag', 'DirectControl_lag_rw', 'mrg']])

    #== Other type of impact
    if impact_type == 'EAR':
        #== Earnings at risk
        Z = EAR_.copy()
        Z = Z[Z.scenario == scenario_type]
        Z = Z[Z.horizon == scenario_horizon]
        Z['mrg'] = Z['gvkey'].astype(int).astype(str)+'-'+Z['cyear'].astype(int).astype(str)
        Z['EAR'] = Z['EAR'].replace(['No', 'Yes'], [0,1])
        dt = dt.merge(Z[['EAR', 'horizon', 'mrg']])
    if impact_type == 'EMR':
        #== Same filtering, but the 'EMR' column is merged as-is
        Z = EAR_.copy()
        Z = Z[Z.scenario == scenario_type]
        Z = Z[Z.horizon == scenario_horizon]
        Z['mrg'] = Z['gvkey'].astype(int).astype(str)+'-'+Z['cyear'].astype(int).astype(str)
        dt = dt.merge(Z[['EMR', 'horizon', 'mrg']])
    #== Clean up corners
    dt = dt[dt.capx_usd >1]

    #== Concentration measure
    for i in initiatives:
        # Every column whose name contains the initiative name is summed
        init = [col for col in dt.columns if i in col]
        dt[i] = dt[init].sum(axis = 1)
    # The denominator does not change inside the loop, so compute it once
    total_initiatives = dt[initiatives].sum(axis = 1).dropna()
    for i in initiatives_sdgs:
        dt[i+'_%'] = dt[i]/total_initiatives
    dt['total_effort'] = dt[initiatives_sdgs].sum(axis = 1)

    if diversification_measure == 'entropy':
        dt['diversification'] = dt.apply(lambda x: get_entropy(x, initiatives_sdgs), axis = 1)
    elif diversification_measure == 'entropy_segments':
        dt['diversification'] = dt.apply(lambda x: get_entropy_segments(x, initiatives_sdgs), axis = 1)
        # 'segs' is consumed by make_strategy_diversification for this measure
        dt['segs'] = dt.apply(lambda x: get_segments(x, initiatives_sdgs), axis = 1)
    elif diversification_measure == 'simpson':
        dt['diversification'] = dt.apply(lambda x: get_simpson(x, initiatives_sdgs), axis = 1)

    #== Coarse regions
    dt['MacroRegionShort'] = dt['MacroRegion']
    dt['MacroRegionShort'] = dt['MacroRegionShort'].replace(['Latin America and Caribbean'], ['United States and Canada'])
    dt['MacroRegionShort'] = dt['MacroRegionShort'].replace(['Africa', 'Middle-East'], ['Europe', 'Europe'])
    # NOTE(review): this filter uses the original MacroRegion column, so the rows
    # remapped just above are dropped anyway -- confirm whether MacroRegionShort was intended.
    dt = dt[dt.MacroRegion.isin(['Asia-Pacific', 'Europe', 'United States and Canada'])]
    dt['MacroRegionShort'] = dt['MacroRegionShort'].replace(['United States and Canada'], ['Americas'])

    #=== Should you normalise the diversification measure?
    if sector_normalisation:
        print('Normalising by sector')
        industry_average = dt[['diversification', 'rfyear', 'GICS_level_3', 'MacroRegionShort']].groupby(['rfyear', 'GICS_level_3', 'MacroRegionShort']).mean().reset_index()
        industry_average['mrg_i'] = industry_average['rfyear'].astype(int).astype(str)+'-'+industry_average['GICS_level_3']+'-'+industry_average['MacroRegionShort']
        dt['mrg_i'] = dt['rfyear'].astype(int).astype(str)+'-'+dt['GICS_level_3']+'-'+dt['MacroRegionShort']
        dt = dt.merge(industry_average.rename(columns = {'diversification': 'industry_diversification'}).reset_index())
        dt = dt.sort_values(by = ['gvkey', 'rfyear']).reset_index(drop=True)
        dt['diversification']/=dt.industry_diversification

    #== Strategy-specific diversification and effort (assigned by index alignment)
    for S in ['Risk mitigation','Stakeholders engagement', 'Innovation']:
         ST, TE = make_strategy_diversification(dt, S, sdgs, diversification_measure, sector_normalisation)
         dt[S] = ST
         dt['total_effort_'+S] = TE
    #== make the rolling values
    for S in ['total_effort', 'diversification', 'Risk mitigation','Stakeholders engagement', 'Innovation']:
        # One row per mrg (last non-null value of each column)
        dt = dt.groupby('mrg').last().reset_index()
        dt = dt.sort_values(by = ['gvkey', 'rfyear'])
        t_rol = dt[['gvkey', 'rfyear', S]].set_index('rfyear').groupby('gvkey').rolling(rolling_window_size).mean().reset_index()
        t_rol['mrg'] = ut.utils().make_mrg(t_rol, 'rfyear')
        t_rol.columns = t_rol.keys()+'_rw'
        dt = dt.merge(t_rol.drop(columns = ['gvkey_rw', 'rfyear_rw']).rename(columns = {'mrg_rw': 'mrg'}))

    #== Emission data-source dummies (the column for source value 1 is the dropped baseline)
    sources = pd.read_csv('local_data/emissions_sources.csv')
    src = pd.get_dummies(sources['data_source'])
    src = src.drop(columns = [1])
    src['mrg'] = sources['mrg']
    src = src.rename(columns = {0: 'Estimated', 0.5: 'Mixed'})
    dt = dt.merge(src)

    #== Sector / year / country dummies
    # Built AFTER the merge above: that merge resets the index and may drop
    # rows, and the concat below aligns on the index.  Building them before
    # the merge attached dummies to the wrong rows whenever the merge was not
    # a perfect one-to-one match.
    sec = pd.get_dummies(dt.GICS_level_1, drop_first = True)
    yer = pd.get_dummies(dt.rfyear, drop_first = True)
    geo = pd.get_dummies(dt['loc'], drop_first = True)
    dummies =  list(sec.columns) + list(geo.columns) + ['Estimated', 'Mixed']
    if years_FE:
        print('Adding year fixed effects')
        dummies = dummies + list(yer.columns)
    dt = pd.concat((dt, sec, yer, geo), axis = 1)
    dt = dt.groupby('mrg').last().reset_index()

    #=== Add volatility
    vol = pd.read_csv('local_data/volatility_file.csv')
    dt = dt.merge(vol[['mrg', 'volatility_rw', 'ret_usd_rw', 'volatility', 'ret_usd']])
    return dt, dummies




def make_strategy_diversification(Y, strategy_type, sdgs, diversification_measure, sector_normalisation = True):
    """Compute diversification and total effort for one strategy family.

    Parameters
    ----------
    Y : pandas.DataFrame
        Panel holding the ``'<initiative> - SDG <n>'`` columns, their ``'_%'``
        share columns, ``rfyear``, ``gvkey``, ``GICS_level_3`` and
        ``MacroRegionShort`` (plus ``segs`` for ``'entropy_segments'``).
        It is copied, never modified.
    strategy_type : {'Innovation', 'Risk mitigation', 'Stakeholders engagement'}
        Selects the group of initiatives.
    sdgs : iterable
        SDG numbers combined with each initiative name.
    diversification_measure : {'entropy', 'entropy_segments', 'simpson'}
        Row-wise measure; ``'entropy_segments'`` is the entropy multiplied by
        the ``segs`` column.
    sector_normalisation : bool
        Divide the measure by its year / GICS-level-3 / region mean.

    Returns
    -------
    (pandas.Series, pandas.Series)
        The strategy measure and the strategy's total effort.  When
        ``sector_normalisation`` is true both are indexed by the fresh
        ``RangeIndex`` of the frame re-sorted by ``gvkey`` and ``rfyear``.

    Raises
    ------
    ValueError
        If ``strategy_type`` or ``diversification_measure`` is not one of the
        supported values (previously an obscure NameError / KeyError).
    """
    initiatives_by_strategy = {
        'Innovation': ['r&d investments', 'new products', 'association', 'organizational structuring'],
        'Risk mitigation': ['modification of procedures', 'asset modification', 'training',  'assessment and measurement', 'adoption of standards and rules'],
        'Stakeholders engagement': ['donation & funding', 'communication', 'volunteerism', 'incentives', 'pricing'],
    }
    if strategy_type not in initiatives_by_strategy:
        raise ValueError('Unknown strategy_type: %r' % (strategy_type,))
    if diversification_measure not in ('entropy', 'entropy_segments', 'simpson'):
        raise ValueError('Unknown diversification_measure: %r' % (diversification_measure,))

    X = Y.copy()
    target_initiatives = initiatives_by_strategy[strategy_type]
    # Same initiative-major ordering as before: all SDGs of the first initiative, then the next, ...
    target_initiatives_pct = [T+' - SDG '+str(S) for T in target_initiatives for S in sdgs]

    if diversification_measure == 'simpson':
        X['Strategy'] = X.apply(lambda x: get_simpson(x, target_initiatives_pct), axis = 1)
    else:
        X['Strategy'] = X.apply(lambda x: get_entropy(x, target_initiatives_pct), axis = 1)
        if diversification_measure == 'entropy_segments':
            X['Strategy'] = X['Strategy']*X['segs']
    # Effort is the sum of the raw (not '_%') columns
    X['total_effort_'+strategy_type] = X[target_initiatives_pct].sum(axis = 1)

    #=== Should you normalise the diversification measure?
    if sector_normalisation:
        print('Normalising ',strategy_type,' by sector')
        industry_average = X[['Strategy', 'rfyear', 'GICS_level_3', 'MacroRegionShort']].groupby(['rfyear', 'GICS_level_3', 'MacroRegionShort']).mean().reset_index()
        industry_average['mrg_i'] = industry_average['rfyear'].astype(int).astype(str)+'-'+industry_average['GICS_level_3']+'-'+industry_average['MacroRegionShort']
        X['mrg_i'] = X['rfyear'].astype(int).astype(str)+'-'+X['GICS_level_3']+'-'+X['MacroRegionShort']
        # Inner merge on every shared column, then re-sort and renumber the rows
        X = X.merge(industry_average.rename(columns = {'Strategy': 'industry_diversification_strategy'}).reset_index())
        X = X.sort_values(by = ['gvkey', 'rfyear']).reset_index(drop=True)
        X['Strategy']/=X.industry_diversification_strategy

    Strategy = X['Strategy']
    TE = X['total_effort_'+strategy_type]

    return Strategy, TE






#%%
def get_initiatives(sdgs):
    """Build the initiative / SDG column-name lists for the selected SDGs.

    Parameters
    ----------
    sdgs : iterable of int
        SDG numbers to keep.

    Returns
    -------
    tuple
        ``(initiatives, missing_sdgs, initiatives_sdgs, initiatives_to_remove)``:
        the initiative names from ``utils.return_Initiatives()``, the SDG
        numbers in 1..17 that are not in ``sdgs`` (ascending), and the
        ``'<initiative> - SDG <n>'`` names for the kept and the missing SDGs
        (grouped by SDG, initiatives in their original order).
    """
    initiatives = ut.utils().return_Initiatives()
    # Set difference, not symmetric difference: an SDG outside 1..17 passed in
    # `sdgs` must not be reported as "missing" as well as requested.
    missing_sdgs = sorted(set(range(1, 18)) - set(sdgs))
    initiatives_sdgs = [i+' - SDG '+str(S) for S in sdgs for i in initiatives]
    initiatives_to_remove = [i+' - SDG '+str(S) for S in missing_sdgs for i in initiatives]
    return initiatives, missing_sdgs, initiatives_sdgs, initiatives_to_remove

def make_the_strategies_variables(dt, strategy_type = 'intensity'):
    """Attach strategy variables and their rolling means to a copy of ``dt``.

    For each of 'Total', 'Risk mitigation', 'Stakeholders engagement' and
    'Innovation', ``ms.make_strategy`` is evaluated on the original ``dt``,
    reduced to one row per ``mrg``, merged in under the target's name, and a
    per-firm rolling mean (``'<target>_rw'``) is merged in as well.

    Returns
    -------
    pandas.DataFrame
        The enriched copy of ``dt``.
    """
    candidates = ['diversification', 'Total', 'Risk mitigation',
                  'Stakeholders engagement', 'Innovation']
    skipped = ('diversification', 'concentration')
    merged = dt.copy()
    for target_var in candidates:
        if target_var in skipped:
            continue
        strategy = ms.make_strategy(dt, strategy_type, target_var)
        strategy = strategy.groupby('mrg').last().reset_index()
        merged = merged.merge(strategy).rename(columns={'Strategy': target_var})
        merged = merged.sort_values(by=['gvkey', 'rfyear'])

        #== rolling mean of the new variable, per firm
        yearly = merged[['gvkey', 'rfyear', target_var]].set_index('rfyear')
        rolled = yearly.groupby('gvkey').rolling(rolling_window_size).mean().reset_index()
        rolled['mrg'] = ut.utils().make_mrg(rolled, 'rfyear')
        rolled.columns = rolled.keys()+'_rw'
        rolled = rolled.drop(columns=['gvkey_rw', 'rfyear_rw']).rename(columns={'mrg_rw': 'mrg'})
        merged = merged.merge(rolled)
    return merged